Challenge 9

Author

Austin Liu

Data Import and Packages

library(tidyverse)
library(DT)

# Load the state-level baby-name counts from the project's artifacts folder.
names <- here::here(
  "supporting_artifacts",
  "learning_targets",
  "Lab 9",
  "StateNames_A.csv"
) |>
  read_csv()

# Interactive preview of the full data set.
# NOTE(review): DT warns that this table is too big for client-side
# rendering; consider previewing a subset or using server-side processing.
datatable(names)
Warning in instance$preRenderHook(instance): It seems your data is too big
for client-side DataTables. You may consider server-side processing: https://
rstudio.github.io/DT/server.html

Part 1: Summarizing and Visualizing Allisons

Question 1

# Add a `Sex` column mirroring `Gender` (the original `Gender` column is kept).
names <- names |>
  mutate(Sex = Gender)

# Total number of babies named "Allison" per state, one column per sex.
# Filtering first avoids grouping the whole data set, `.groups = "drop"`
# returns an ungrouped tibble, and `values_fill = 0` zero-fills every
# state/sex combination without records (not only the `M` column).
allisonname <- names |>
  filter(Name == "Allison") |>
  group_by(State, Sex) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  pivot_wider(names_from = Sex, values_from = Count, values_fill = 0)

# Render the state / female / male totals as an HTML table.
knitr::kable(
  allisonname[, 1:3],
  format = "html",
  col.names = c("State", "Total Sum of Female", "Total Sum of Male")
)
State Total Sum of Female Total Sum of Male
AK 232 0
AL 1535 0
AR 1198 0
AZ 1880 0
CA 12413 0
CO 1594 0
CT 1099 0
DC 321 0
DE 294 0
FL 4455 0
GA 3257 0
HI 183 0
IA 1477 0
ID 451 0
IL 5110 0
IN 3067 0
KS 1283 0
KY 1905 20
LA 1209 0
MA 2218 0
MD 2229 0
ME 340 0
MI 4014 0
MN 2374 0
MO 2882 0
MS 817 0
MT 226 0
NC 3435 0
ND 285 0
NE 807 0
NH 412 0
NJ 3052 0
NM 399 0
NV 729 0
NY 5747 0
OH 5487 0
OK 1421 0
OR 1186 0
PA 4307 0
RI 306 0
SC 1228 0
SD 376 0
TN 2488 0
TX 10192 0
UT 1125 0
VA 3220 0
VT 135 0
WA 1956 0
WI 2367 0
WV 813 0
WY 142 0

Question 2

# Keep only the female "Allison" records for the rest of the analysis.
allisonname_F <- names |>
  filter(Name == "Allison") |>
  filter(Sex == "F")

Question 3

# Yearly total of female Allisons, summed across all states.
allisonname_f_byYear <- allisonname_F |>
  group_by(Year) |>
  summarize(Count = sum(Count))

# Point-and-line chart of the yearly totals.
allisonname_f_byYear |>
  ggplot(aes(x = Year, y = Count)) +
  geom_point() +
  geom_line() +
  labs(title = 'Popularity of the name "Allison" over time')

Part 2: Modeling the Number of Allisons

Question 4

# Simple linear regression of the yearly Allison count on year.
Model1 <- lm(Count ~ Year, data = allisonname_f_byYear)

Question 5

# Scatterplot of the yearly counts with the fitted regression line.
# The plot is built from the data frame the model was fit on rather than
# piping the `lm` object into ggplot(), which depends on ggplot2 implicitly
# converting the model into a data frame.
allisonname_f_byYear |>
  ggplot(mapping = aes(x = Year, y = Count)) +
  geom_point() +
  stat_smooth(method = "lm")
`geom_smooth()` using formula 'y ~ x'

Question 6

# Print the call and coefficients of the model fitted in Question 4 instead
# of fitting the same regression a second time.
Model1

Call:
lm(formula = Count ~ Year, data = allisonname_f_byYear)

Coefficients:
(Intercept)         Year  
   209689.8       -101.5  

y-hat (estimated Count of Allisons) = 209,689.8 - 101.5 (Year)

Question 7

# Residuals-versus-fitted plot for checking the linear model's fit.
broom::augment(Model1) |>
  ggplot(aes(x = .fitted, y = .resid)) +
  geom_point()

In the plot of the residuals against the fitted values, we do not see any discernible pattern.

Question 8

Our model shows that the name Allison is declining in popularity: the fitted slope estimates about 101.5 fewer babies named Allison each year. Even so, Allison is still quite popular, as about 5,000 newborn babies were given that name in our most recent year of data.

Part 3: Spelling by State

Question 1

# Yearly national totals for the three male spellings, one line per spelling.
# `.groups = "drop"` makes the summary explicitly ungrouped instead of
# silently leaving it grouped by `Year`.
names |>
  filter(Sex == "M", Name %in% c("Allan", "Alan", "Allen")) |>
  group_by(Year, Name) |>
  summarize(Count = sum(Count), .groups = "drop") |>
  ggplot(mapping = aes(x = Year, y = Count, color = Name)) +
  geom_point() +
  geom_line() +
  labs(title = 'Popularity of the name "Allen, Allan, Alan" over time')

Question 2

# Counts of each male spelling in California and Pennsylvania for 2000,
# reshaped to one row per state and one column per spelling.
# `id_cols = State` states the row identifier explicitly so the reshape does
# not depend on every other leftover column being constant within a state.
alan_name_M <- names |>
  filter(
    Sex == "M",
    Name %in% c("Allan", "Alan", "Allen"),
    Year == 2000,
    State %in% c("PA", "CA")
  ) |>
  pivot_wider(id_cols = State, names_from = Name, values_from = Count) |>
  select(State, Alan, Allen, Allan)
alan_name_M
# A tibble: 2 × 4
  State  Alan Allen Allan
  <chr> <dbl> <dbl> <dbl>
1 CA      579   176   131
2 PA       51    56    12
# HTML table of the per-state counts with descriptive column headers.
spelling_count_headers <- c(
  "State",
  "Count of Alan",
  "Count of Allen",
  "Count of Allan"
)
knitr::kable(
  alan_name_M[, 1:4],
  format = "html",
  col.names = spelling_count_headers
)
State Count of Alan Count of Allen Count of Allan
CA 579 176 131
PA 51 56 12

Question 3

# Share of each spelling within a state (CA and PA, year 2000): every count
# is divided by its state's total before reshaping to one row per state.
# The values are proportions between 0 and 1.
alan_name_M_per <- names |>
  filter(Sex == "M") |>
  filter(Name %in% c("Allan", "Alan", "Allen")) |>
  filter(Year == 2000) |>
  filter(State %in% c("PA", "CA")) |>
  group_by(State) |>
  mutate(Count = Count / sum(Count)) |>
  pivot_wider(names_from = Name, values_from = Count) |>
  select(State, Alan, Allen, Allan)
alan_name_M_per
# A tibble: 2 × 4
# Groups:   State [2]
  State  Alan Allen Allan
  <chr> <dbl> <dbl> <dbl>
1 CA    0.653 0.199 0.148
2 PA    0.429 0.471 0.101
# Styled HTML table of the within-state spelling shares.
# `bootstrap_options` is the styling argument that applies to HTML tables;
# `latex_options` only affects LaTeX output, so the original "striped"
# request had no effect here. The native pipe matches the rest of the file.
# NOTE(review): the values are proportions (0-1) although the headers say
# "Percent"; headers are left unchanged to match the rendered output.
knitr::kable(
  alan_name_M_per[, 1:4],
  format = "html",
  col.names = c(
    "State",
    "Percent by State named Alan",
    "Percent by State named Allen",
    "Percent by State named Allan"
  )
) |>
  kableExtra::kable_styling(bootstrap_options = "striped", font_size = 13) |>
  kableExtra::row_spec(1:2, color = "white", background = "black")
State Percent by State named Alan Percent by State named Allen Percent by State named Allan
CA 0.6534989 0.1986456 0.1478555
PA 0.4285714 0.4705882 0.1008403